Aging_Mice_GSE129788 Analysis

Aging_Mice_GSE129788

PAPER
“Single-cell transcriptomic profiling of the aging mouse brain” Isolated single cell from mice brain to perform single cell RNAseq 8 young and 8 old mice snRNAseq samples

Packages_Session

#---------- Knitr
setwd("/Volumes/projects_secondary/bras/Lee/ASAP/Aging_Mice_GSE129788")
knitr::opts_chunk$set(warning=FALSE, message=FALSE, cache=FALSE, error=FALSE, cache.lazy=FALSE)
#knitr::opts_knit$set(root.dir='/Volumes/projects_secondary/bras/Lee/ASAP/Aging_Mice_GSE129788')
#options(scipen=999)
#options(stringsAsFactors = FALSE)
# Chunk Options  
# include = FALSE, runs code, prevents code and results from appearing  
# echo = FALSE, runs code, prevents code from appearing but not the results  
# eval = FALSE, does not run code  
# message = FALSE prevents messages that are generated by code from appearing 
# warning = FALSE prevents warnings that are generated by code from appearing 
# fig.cap = "..." adds a caption to graphical results.

#---------- Packages
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("fgsea")
#install.packages('metap')
suppressPackageStartupMessages({
  library(betareg)
  library(biomaRt)
  library(corrplot)
  library(cowplot)
  library(DoubletFinder)
  library(dplyr)
  library(DT)
  library(fgsea)
  library(ggplot2)
  library(ggpubr)
  library(gprofiler2)
  library(knitr)
  library(kableExtra)
  library(lubridate)
  library(magrittr)
  library(MAST)
  library(metap)
  library(multtest)
  library(parallel)
  library(purrr)
  library(RColorBrewer)
  library(readr)
  library(reshape2)
  library(reticulate)
  library(rmarkdown)
  library(Seurat)
  library(stringr)
  library(tibble)
  library(tidyr)
  library(viridis)
})
## Warning: package 'GenomeInfoDb' was built under R version 4.0.4
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] viridis_0.5.1               viridisLite_0.3.0          
##  [3] tidyr_1.1.3                 tibble_3.1.0               
##  [5] stringr_1.4.0               SeuratObject_4.0.0         
##  [7] Seurat_4.0.0                rmarkdown_2.7              
##  [9] reticulate_1.18             reshape2_1.4.4             
## [11] readr_1.4.0                 RColorBrewer_1.1-2         
## [13] purrr_0.3.4                 multtest_2.46.0            
## [15] metap_1.4                   MAST_1.16.0                
## [17] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [19] Biobase_2.50.0              GenomicRanges_1.42.0       
## [21] GenomeInfoDb_1.26.4         IRanges_2.24.1             
## [23] S4Vectors_0.28.1            BiocGenerics_0.36.0        
## [25] MatrixGenerics_1.2.1        matrixStats_0.58.0         
## [27] magrittr_2.0.1              lubridate_1.7.10           
## [29] kableExtra_1.3.4            knitr_1.31                 
## [31] gprofiler2_0.2.0            ggpubr_0.4.0               
## [33] ggplot2_3.3.3               fgsea_1.16.0               
## [35] DT_0.17                     dplyr_1.0.5                
## [37] DoubletFinder_2.0.3         cowplot_1.1.1              
## [39] corrplot_0.84               biomaRt_2.46.3             
## [41] betareg_3.1-4              
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.1             tidyselect_1.1.0       RSQLite_2.2.4         
##   [4] AnnotationDbi_1.52.0   htmlwidgets_1.5.3      grid_4.0.3            
##   [7] BiocParallel_1.24.1    Rtsne_0.15             munsell_0.5.0         
##  [10] ica_1.0-2              codetools_0.2-18       mutoss_0.1-12         
##  [13] miniUI_0.1.1.1         future_1.21.0          withr_2.4.1           
##  [16] colorspace_2.0-0       rstudioapi_0.13        ROCR_1.0-11           
##  [19] tensor_1.5             ggsignif_0.6.1         listenv_0.8.0         
##  [22] Rdpack_2.1.1           GenomeInfoDbData_1.2.4 polyclip_1.10-0       
##  [25] mnormt_2.0.2           bit64_4.0.5            parallelly_1.24.0     
##  [28] vctrs_0.3.6            generics_0.1.0         TH.data_1.0-10        
##  [31] xfun_0.22              BiocFileCache_1.14.0   R6_2.5.0              
##  [34] flexmix_2.3-17         spatstat.utils_2.1-0   bitops_1.0-6          
##  [37] cachem_1.0.4           DelayedArray_0.16.2    assertthat_0.2.1      
##  [40] promises_1.2.0.1       scales_1.1.1           multcomp_1.4-16       
##  [43] nnet_7.3-15            gtable_0.3.0           globals_0.14.0        
##  [46] goftest_1.2-2          sandwich_3.0-0         rlang_0.4.10          
##  [49] systemfonts_1.0.1      splines_4.0.3          rstatix_0.7.0         
##  [52] lazyeval_0.2.2         broom_0.7.5            yaml_2.2.1            
##  [55] abind_1.4-5            backports_1.2.1        httpuv_1.5.5          
##  [58] tools_4.0.3            ellipsis_0.3.1         jquerylib_0.1.3       
##  [61] ggridges_0.5.3         TFisher_0.2.0          Rcpp_1.0.6            
##  [64] plyr_1.8.6             progress_1.2.2         zlibbioc_1.36.0       
##  [67] RCurl_1.98-1.2         prettyunits_1.1.1      deldir_0.2-10         
##  [70] rpart_4.1-15           openssl_1.4.3          pbapply_1.4-3         
##  [73] zoo_1.8-9              ggrepel_0.9.1          haven_2.3.1           
##  [76] cluster_2.1.1          scattermore_0.7        data.table_1.14.0     
##  [79] openxlsx_4.2.3         lmtest_0.9-38          RANN_2.6.1            
##  [82] tmvnsim_1.0-2          mvtnorm_1.1-1          fitdistrplus_1.1-3    
##  [85] patchwork_1.1.1        xtable_1.8-4           mime_0.10             
##  [88] hms_1.0.0              evaluate_0.14          XML_3.99-0.5          
##  [91] rio_0.5.26             readxl_1.3.1           gridExtra_2.3         
##  [94] compiler_4.0.3         KernSmooth_2.23-18     crayon_1.4.1          
##  [97] htmltools_0.5.1.1      mgcv_1.8-34            later_1.1.0.1         
## [100] Formula_1.2-4          DBI_1.1.1              dbplyr_2.1.0          
## [103] MASS_7.3-53.1          rappdirs_0.3.3         Matrix_1.3-2          
## [106] car_3.0-10             rbibutils_2.0          igraph_1.2.6          
## [109] forcats_0.5.1          pkgconfig_2.0.3        sn_1.6-2              
## [112] numDeriv_2016.8-1.1    foreign_0.8-81         plotly_4.9.3          
## [115] xml2_1.3.2             svglite_2.0.0          bslib_0.2.4           
## [118] webshot_0.5.2          XVector_0.30.0         rvest_1.0.0           
## [121] digest_0.6.27          sctransform_0.3.2      RcppAnnoy_0.0.18      
## [124] spatstat.data_2.0-0    leiden_0.3.7           cellranger_1.1.0      
## [127] fastmatch_1.1-0        uwot_0.1.10            curl_4.3              
## [130] shiny_1.6.0            modeltools_0.2-23      nlme_3.1-152          
## [133] lifecycle_1.0.0        jsonlite_1.7.2         carData_3.0-4         
## [136] askpass_1.1            fansi_0.4.2            pillar_1.5.1          
## [139] lattice_0.20-41        fastmap_1.1.0          httr_1.4.2            
## [142] plotrix_3.8-1          survival_3.2-9         glue_1.4.2            
## [145] spatstat_1.64-1        zip_2.1.1              png_0.1-7             
## [148] bit_4.0.4              stringi_1.5.3          sass_0.3.1            
## [151] blob_1.2.1             memoise_2.0.0          mathjaxr_1.4-0        
## [154] irlba_2.3.3            future.apply_1.7.0
#  library(reticulate)
#  use_condaenv("base")
#  use_python("/Users/lee.marshall/opt/anaconda3/bin/python", required = TRUE)
#  display python config
#  print(py_config())
#  display module search path
#  sys <- import("sys")
#  sys$path
#  py_module_available("scanpy")

GEO

GSE129788 fastq files only contain single forward read
GEO advised me to download bams from sra run selector - run - data access
Used network pc to download directly to hpc
bamtofastq to recreate fastq files
merge fastq files per sample prior to cellranger
cellranger-6.0.1 to align and count
performed mRNA alignment for standard single cell counts
also performed mRNA and premRNA (introns) counts for trajectory analysis

#--------- bamtofastq
qsubs.sh -n bamtofastq_cellranger -c 10 -a names.txt -s "bamtofastq --nthreads=10 --reads-per-fastq=1000000000 SAMPLE.bam SAMPLE_fastq"

#--------- Merge fastq files
qsubs.sh -a names.txt -n fastq_merge -s "cat SAMPLE_fastq/*/*R1_001.fastq.gz > SAMPLE_S1_L001_R1_001.fastq.gz"

qsubs.sh -a names.txt -n fastq_merge -s "cat SAMPLE_fastq/*/*R2_001.fastq.gz > SAMPLE_S1_L001_R2_001.fastq.gz"

qsubs.sh -a names.txt -n fastq_merge -s "cat SAMPLE_fastq/*/*I1_001.fastq.gz > SAMPLE_S1_L001_I1_001.fastq.gz"
#---------- cellranger_mRNA
module load cellranger/cellranger-6.0.1
qsubs.sh -n cellranger_count -c 4 -w 48 -a names.txt -s "cellranger count --sample=SAMPLE --id=SAMPLE_count --localcores=4 --fastqs=./fastq --chemistry=SC3Pv2 --transcriptome=/home/lee.marshall/bras_primary/software/cellranger_v6/refdata-gex-mm10-2020-A"

#---------- cellranger_mRNA_premRNA
module load cellranger/cellranger-6.0.1
qsubs.sh -n cellranger_count -c 4 -w 48 -a names.txt -s "cellranger count --sample=SAMPLE --id=SAMPLE_count --localcores=4 --fastqs=./fastq --chemistry=SC3Pv2 --include-introns  --transcriptome=/home/lee.marshall/bras_primary/software/cellranger_v6/refdata-gex-mm10-2020-A"

Seurat QC

Quality control graphs used to identify filtering thresholds
Senescence Cell not selectively removed

#---------- Seurat_Object
#---- Counts
dir <- "cellranger_mRNA/"
name <- list.files(path = "cellranger_mRNA") %>% 
             gsub(pattern = "_count", replacement = "")

#---- Count Matrix
seurat_counts <- mclapply(name, function(i){
  count <- Read10X(data.dir=paste0(dir,i,"_count/outs/filtered_feature_bc_matrix/"),
                  gene.column=2, # col1 Ensemble, col2 Symbol
                  unique.features=TRUE,
                  strip.suffix=TRUE) 
  seurat_obj <- CreateSeuratObject(counts=count, 
                                   project=i,
                                   min.cells=10,
                                   min.genes=200)
})
#---- Merge Seurat Objects
seurat_combined <- merge(x = seurat_counts[[1]], 
                               y = seurat_counts[2:length(seurat_counts)],
                               add.cell.ids = unlist(name), 
                               project="Aging_Mice_GSE129788")

#---------- Add Metadata
# View(seurat_combined@meta.data)
# orig.ident: contains the sample identity if known
# nCount_RNA: number of UMIs per cell
# nFeature_RNA: number of genes detected per cell
#---- Create metadata dataframe
metadata <- seurat_combined@meta.data
#---- Add cell IDs to metadata
metadata$cells <- rownames(metadata)
#---- Rename columns
names(metadata)[1:3] <- c("name","nUMI","nGene")
#---- Add number of genes per UMI for each cell to metadata
metadata$log10GenesPerUMI <- log10(metadata$nGene) / log10(metadata$nUMI)
#---- Compute percent mito ratio
metadata <- cbind(metadata,
                  PercentageFeatureSet(object = seurat_combined, pattern = "^mt-") / 100)
names(metadata)[6] <- "mitoRatio"
#---- Create Sample
metadata <- metadata %>% 
  left_join(data.frame(name = unique(metadata$name),
                       sample = c("Old1","Old2","Old3","Old4","Old5","Old6","Old7","Old8",
                                  "Young1","Young2","Young3","Young4","Young5","Young6",
                                  "Young7","Young8"),
                       group = rep(c("Old","Young"),each=8)), by = "name")
#---- Add metadata back to Seurat object
rownames(metadata) <- metadata$cells
seurat_combined@meta.data <- metadata
#---- Senescence Markers, p16 = Cdkn2a, p21 = Cdkn1a
seurat_combined$Cdkn2a_Count <- 
  seurat_combined@assays$RNA@counts[rownames(seurat_combined) == "Cdkn2a",]
seurat_combined$Cdkn1a_Count <- 
  seurat_combined@assays$RNA@counts[rownames(seurat_combined) == "Cdkn1a",]
#---- rm data
rm(seurat_counts, metadata)
#---- save rds
saveRDS(seurat_combined, file="seurat_results/seurat_combined.rds")
#---------- Cell_Counts
#----- Load data
seurat_combined <- readRDS(file = "seurat_results/seurat_combined.rds")
#----- Visualize the number of cell counts per sample
seurat_combined@meta.data %>% 
    ggplot(aes(x=sample, fill=sample)) + 
    geom_bar() + 
    ggtitle("Number of Cells") + 
    theme_classic() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
    theme(plot.title = element_text(hjust=0.5, face="bold"))

#---------- UMI_Counts_per_cell (transcripts)
#----- Visualize the number UMIs/transcripts per cell
seurat_combined@meta.data %>% 
    ggplot(aes(color=sample, x=nUMI, fill= sample)) + 
    geom_density(alpha = 0.2) + 
    geom_vline(xintercept = 500, linetype="dashed", colour = "red3") + 
    geom_vline(xintercept = 10000, linetype="dashed", colour = "red3") + 
    ggtitle("Number of UMIs/Transcripts per Cell") + 
    ylab("Cell density") + 
    scale_x_log10() + 
    theme_classic() + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
    theme(plot.title = element_text(hjust=0.5, face="bold"))

#---------- Genes_per_cell
#----- Visualize the distribution of genes detected per cell via histogram
p1 <- seurat_combined@meta.data %>% 
    ggplot(aes(color=sample, x=nGene, fill= sample)) + 
    geom_density(alpha = 0.2) + 
    geom_vline(xintercept = 250, linetype="dashed", colour = "red3") + 
    geom_vline(xintercept = 5000, linetype="dashed", colour = "red3") + 
    ggtitle("Number of Genes per Cell") + 
    scale_x_log10() + 
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
    theme(plot.title = element_text(hjust=0.5, face="bold"))

#----- Visualize the distribution of genes detected per cell via boxplot
p2 <- seurat_combined@meta.data %>% 
    ggplot(aes(x=sample, y=log10(nGene), fill=sample)) + 
    geom_boxplot() + 
    ggtitle("Number of Genes vs Number of Cells") +   
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.title = element_text(hjust=0.5, face="bold"))
#----- Plots
cowplot::plot_grid(p1, p2, ncol=2)

#---------- UMI_Counts_vs_Genes
#----- Visualize the correlation between genes detected and number of UMIs/transcripts 
# determine whether strong presence of cells with low numbers of genes/UMIs
seurat_combined@meta.data %>% 
    ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + 
    geom_point(size=0.1) + 
    geom_vline(xintercept = 500, linetype="dashed", colour = "red3") + 
    geom_vline(xintercept = 10000, linetype="dashed", colour = "red3") + 
    geom_hline(yintercept = 250, linetype="dashed", colour = "red3") + 
    geom_hline(yintercept = 5000, linetype="dashed", colour = "red3") + 
    ggtitle("Number of UMIs/Transcripts vs Number of Cells") +   
    stat_smooth(method=lm) + 
    scale_x_log10() + 
    scale_y_log10() + 
    theme_classic() + 
    scale_colour_viridis_c() +
    facet_wrap(~sample)

#---------- Mitochondrial_counts_ratio
#----- Visualize the distribution of mitochondrial gene expression detected per cell
seurat_combined@meta.data %>% 
    ggplot(aes(color=sample, x=mitoRatio, fill=sample)) + 
    geom_density(alpha = 0.2) + 
    geom_vline(xintercept = 0.20, linetype="dashed", colour = "red3") + 
    ggtitle("Mitochondrial Counts Ratio") + 
    ylab("Cell density") + 
    scale_x_log10() + 
    theme_classic()

#---------- Complexity
#----- Visualize the overall complexity of the gene expression by visualizing 
# the genes detected per UMI
seurat_combined@meta.data %>%
    ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) + 
    geom_density(alpha = 0.2) +
    geom_vline(xintercept = 0.85, linetype="dashed", colour = "red3") + 
    ggtitle("Complexity Genes per UMI") + 
    ylab("Cell density") + 
    theme_classic() 

#---------- Senescence Markers 
#----- Senescence marker vs Number of UMIs/Transcripts
seurat_combined@meta.data %>% 
  ggplot(aes(x=nUMI, y=Cdkn2a_Count, color=sample)) + 
  geom_point(size=0.5) +
  geom_vline(xintercept = 500, linetype="dashed", colour = "red3") + 
  geom_vline(xintercept = 10000, linetype="dashed", colour = "red3") +
  ggtitle("Cdkn2a Counts vs Number of UMIs/Transcripts") + 
  scale_x_log10() + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  theme(plot.title = element_text(hjust=0.5, face="bold")) + 
  facet_wrap(~sample)

seurat_combined@meta.data %>% 
  ggplot(aes(x=nUMI, y=Cdkn1a_Count, color=sample)) + 
  geom_point(size=0.5) +
  geom_vline(xintercept = 500, linetype="dashed", colour = "red3") + 
  geom_vline(xintercept = 10000, linetype="dashed", colour = "red3") +
  ggtitle("Cdkn1a Counts vs Number of UMIs/Transcripts") + 
  scale_x_log10() + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  theme(plot.title = element_text(hjust=0.5, face="bold")) + 
  facet_wrap(~sample)

#----- Senescence marker vs Number of Cells
seurat_combined@meta.data %>% 
  ggplot(aes(x=nGene, y=Cdkn2a_Count, color=sample)) + 
  geom_point(size=0.5) +
  geom_vline(xintercept = 250, linetype="dashed", colour = "red3") + 
  geom_vline(xintercept = 5000, linetype="dashed", colour = "red3") +
  ggtitle("Cdkn2a Counts vs Number of Cells") + 
  scale_x_log10() + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  theme(plot.title = element_text(hjust=0.5, face="bold")) + 
  facet_wrap(~sample)

seurat_combined@meta.data %>% 
  ggplot(aes(x=nGene, y=Cdkn1a_Count, color=sample)) + 
  geom_point(size=0.5) +
  geom_vline(xintercept = 250, linetype="dashed", colour = "red3") + 
  geom_vline(xintercept = 5000, linetype="dashed", colour = "red3") +
  ggtitle("Cdkn1a Counts vs Number of Cells") + 
  scale_x_log10() + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  theme(plot.title = element_text(hjust=0.5, face="bold")) + 
  facet_wrap(~sample)

#----- Senescence marker vs mitoRatio
seurat_combined@meta.data %>% 
  ggplot(aes(x=mitoRatio, y=Cdkn2a_Count, color=sample)) + 
  geom_point(size=0.5) +
  geom_vline(xintercept = 0.20, linetype="dashed", colour = "red3") + 
  ggtitle("Cdkn2a Counts vs Mitochondrial Counts Ratio") + 
  scale_x_log10() + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  theme(plot.title = element_text(hjust=0.5, face="bold")) + 
  facet_wrap(~sample)

seurat_combined@meta.data %>% 
  ggplot(aes(x=mitoRatio, y=Cdkn1a_Count, color=sample)) + 
  geom_point(size=0.5) +
  geom_vline(xintercept = 0.20, linetype="dashed", colour = "red3") + 
  ggtitle("Cdkn1a Counts vs Mitochondrial Counts Ratio") + 
  scale_x_log10() + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  theme(plot.title = element_text(hjust=0.5, face="bold")) + 
  facet_wrap(~sample)

#----- Senescence marker vs Complexity Genes per UMI
seurat_combined@meta.data %>% 
  ggplot(aes(x=log10GenesPerUMI, y=Cdkn2a_Count, color=sample)) + 
  geom_point(size=0.5) +
  geom_vline(xintercept = 0.85, linetype="dashed", colour = "red3") + 
  ggtitle("Cdkn2a Counts vs Complexity Genes per UMI") + 
  scale_x_log10() + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  theme(plot.title = element_text(hjust=0.5, face="bold")) + 
  facet_wrap(~sample)

seurat_combined@meta.data %>% 
  ggplot(aes(x=log10GenesPerUMI, y=Cdkn1a_Count, color=sample)) + 
  geom_point(size=0.5) +
  geom_vline(xintercept = 0.85, linetype="dashed", colour = "red3") + 
  ggtitle("Cdkn1a Counts vs Complexity Genes per UMI") + 
  scale_x_log10() + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
  theme(plot.title = element_text(hjust=0.5, face="bold")) + 
  facet_wrap(~sample)

Seurat Filtering

Remove poor quilty cells and doublets
Keep high quality cells without removing biologically relevant cells
* nUMI >= 500
* nUMI <= 10000
* nGene >= 250
* nGene <= 5000
* mitoRatio < 0.20
* log10GenesPerUMI > 0.85

#---------- Seurat_Filtering
# Filter out low quality reads using selected thresholds - these will change with experiment
seurat_filtered <- subset(x = seurat_combined, 
                          subset = (nUMI >= 500) & 
                                   (nUMI <= 10000) & 
                                   (nGene >= 250) & 
                                   (nGene <= 5000) &
                                   (mitoRatio < 0.20) & 
                                   (log10GenesPerUMI > 0.85))
# seurat_filtered <- seurat_combined@meta.data %>% 
#                      filter(nUMI >= 500 & 
#                             nUMI <= 20000 & 
#                             nGene >= 250 & 
#                             nGene <= 10000 & 
#                             mitoRatio < 0.20 & 
#                             log10GenesPerUMI > 0.85)
# seurat_filtered <- subset(seurat_combined, cells = seurat_filtered$cells)
#---- save rds
saveRDS(seurat_filtered, file="seurat_results/seurat_filtered.rds")
#---------- Cell_Counts_Pre_and_Post_Filtered 
#----- Load data
seurat_filtered <- readRDS(file = "seurat_results/seurat_filtered.rds")
#----- Number of Cells
cbind(seurat_combined@meta.data$sample %>% 
        table() %>% 
        as.data.frame() %>% 
        dplyr::rename(Sample="."), 
      seurat_filtered@meta.data$sample %>% 
        table() %>% 
        as.data.frame() %>% 
        dplyr::rename(Sample=".")) %>% 
  kable(caption="Number of Cells", format.args = list(big.mark = ",")) %>% 
  kable_styling(c("striped", "bordered")) %>% 
  add_header_above(c("Pre-Filtered" = 2, "Post-Filtered" = 2))
Number of Cells
Pre-Filtered
Post-Filtered
Sample Freq Sample Freq
Old1 2,826 Old1 2,174
Old2 2,264 Old2 1,799
Old3 6,437 Old3 5,154
Old4 6,035 Old4 4,668
Old5 3,293 Old5 2,639
Old6 3,395 Old6 2,607
Old7 3,662 Old7 3,157
Old8 4,114 Old8 3,606
Young1 2,433 Young1 1,546
Young2 3,049 Young2 2,082
Young3 3,067 Young3 2,305
Young4 4,077 Young4 2,623
Young5 3,233 Young5 2,276
Young6 3,547 Young6 2,235
Young7 5,226 Young7 2,872
Young8 6,405 Young8 2,891
#---------- Violin_plots
cowplot::plot_grid(VlnPlot(object = seurat_combined, 
                           features = c("nGene", "nUMI", "mitoRatio", "log10GenesPerUMI"), 
                           ncol = 1, 
                           pt.size=0, 
                           group.by="sample"), 
                   VlnPlot(object = seurat_filtered, 
                           features = c("nGene", "nUMI", "mitoRatio", "log10GenesPerUMI"), 
                           ncol = 1, 
                           pt.size=0, 
                           group.by="sample"), 
                   labels=c("Pre-Filtered", "Post-Filtered"), ncol = 2)

#---------- Scatter_plots
cowplot::plot_grid(FeatureScatter(object = seurat_combined, 
                                  feature1 = "nGene", 
                                  feature2 = "nUMI", 
                                  pt.size=0.01, 
                                  group.by="sample"), 
                   FeatureScatter(object = seurat_filtered, 
                                  feature1 = "nGene", 
                                  feature2 = "nUMI", 
                                  pt.size=0.01, 
                                  group.by="sample"),
                   FeatureScatter(object = seurat_combined, 
                                  feature1 = "nGene", 
                                  feature2 = "mitoRatio", 
                                  pt.size=0.01, 
                                  group.by="sample"), 
                   FeatureScatter(object = seurat_filtered, 
                                  feature1 = "nGene", 
                                  feature2 = "mitoRatio", 
                                  pt.size=0.01, 
                                  group.by="sample"), 
                   labels=c("Pre-Filtered", "Post-Filtered"), nrow=2, ncol=2)

#---- rm data
rm(seurat_combined)

Seurat Cell Cycle Scoring

Determine if cell cycle regression is required

#---------- Cell_Cycle_Scoring
#----- Normalize the counts
seurat_phase <- NormalizeData(seurat_filtered)
#----- Load cell cycle markers
load(file="seurat_results/seurat_cycle.rda")
#----- Convert human to mouse genes
Human2mouseGenes <- function(x){
   require("biomaRt")
   human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
   mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
   genesV2 = getLDS(attributes = c("hgnc_symbol"), 
                    filters = "hgnc_symbol", 
                    values = x , 
                    mart = human, 
                    attributesL = c("mgi_symbol"), 
                    martL = mouse, 
                    uniqueRows=T)
   converted <- unique(genesV2[, 2])
   return(converted)
}
m_g2m_genes <- Human2mouseGenes(g2m_genes)
m_s_genes <- Human2mouseGenes(s_genes)
#----- Identify the most variable genescbin
seurat_phase <- FindVariableFeatures(seurat_phase, 
                     selection.method = "vst",
                     nfeatures = 2000, 
                     verbose = FALSE)
#----- Scale the counts
seurat_phase <- ScaleData(seurat_phase)
#----- Perform PCA
seurat_phase <- RunPCA(seurat_phase)
#----- Score cells for cell cycle
seurat_phase <- CellCycleScoring(seurat_phase, 
                                 g2m.features = m_g2m_genes, 
                                 s.features = m_s_genes)
#---- save rds
saveRDS(seurat_phase, file="seurat_results/seurat_phase.rds")
#---------- PCA
#----- Load data
seurat_phase <- readRDS(file = "seurat_results/seurat_phase.rds")
#----- Plot the PCA colored by cell cycle phase
DimPlot(seurat_phase,
        reduction = "pca",
        group.by= "Phase",
        split.by = "Phase")

# you can regress out the s and g2m score
# alternatively you could only regress out the s-g2m score to keep 
# non-cyling and cycling cell differences
DimPlot(seurat_phase,
        reduction = "pca",
        group.by= "Phase",
        split.by = "group")

#----- Number of Cell Cycle Phase Cells
seurat_phase@meta.data %>% 
  dplyr::group_by(group) %>% 
  dplyr::count(Phase) %>%
  kable(caption="Number of Cell Cycle Phase Cells", format.args = list(big.mark = ",")) %>% 
  kable_styling(c("striped", "bordered"))
Number of Cell Cycle Phase Cells
group Phase n
Old G1 8,878
Old G2M 8,379
Old S 8,547
Young G1 6,646
Young G2M 6,062
Young S 6,122
#----- Remove data
rm(seurat_phase)

Seurat Batch Correction

Standard batch correction is not considered standard anymore
SCTransform batch correction allows for better normalization
SCTransform method models the UMI counts using a regularized negative binomial
model to remove the variation due to sequencing depth (total nUMIs per cell),
while adjusting the variance based on pooling information across genes with
similar abundances (similar to some bulk RNA-seq methods).

If seurat_integrated has a memory error, run on the HPC
conda env create –file r3.6_seurat.yml
name: r3.6_seurat
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- conda-forge::r-essentials=3.6
- conda-forge::libblas=3.8.0=14_mkl
- openblas
- r-doparallel
- conda-forge::r-seurat=3.2.3
conda activate r3.6_seurat

#---------- Batch_Correction_SCTransform
# The data is preprocessed by splitting into individual ‘assays’. 
# In this experiment, each individual sample is considered an ‘assay’.
# Ensure the dataset used here is the non-normalized dataset
DefaultAssay(seurat_filtered) <- "RNA"

#---------- Normalizing the data
# global-scaling normalization method “LogNormalize” that normalizes the feature expression 
# measurements for each cell by the total expression, multiplies this by a scale factor 
# (10,000 by default), and log-transforms the result. 
# Normalized values are stored in seurat_obj[["RNA"]]@data.
# identical(seurat_norm@assays$RNA@counts, seurat_norm@assays$RNA@data)
# Split seurat object by condition to perform cell cycle scoring and SCT
seurat_split <- SplitObject(seurat_filtered, split.by = "sample")
for (i in 1:length(seurat_split)) {
    seurat_split[[i]] <- NormalizeData(seurat_split[[i]], 
                                       verbose = TRUE)
    seurat_split[[i]] <- CellCycleScoring(seurat_split[[i]], 
                                          g2m.features=m_g2m_genes, 
                                          s.features=m_s_genes)
    seurat_split[[i]] <- SCTransform(seurat_split[[i]], 
                                     vars.to.regress = c("mitoRatio"))
}
#---------- Identification of highly variable features
# calculate a subset of features that exhibit high cell-to-cell variation in the dataset 
# (i.e, they are highly expressed in some cells, and lowly expressed in others)
# genes names stored in seurat_norm@assays[["RNA"]]@var.features
# variable features stored in seurat_norm@assays[["RNA"]]@meta.features
#----- Select the most variable features to use for integration
seurat_features <- SelectIntegrationFeatures(object.list = seurat_split, 
                                            nfeatures = 2000) 
#----- Prepare the SCT list object for integration
seurat_split <- PrepSCTIntegration(object.list = seurat_split, 
                                   anchor.features = seurat_features)
#----- modified to use young samples as the reference
# first time used reference=c(9,10,11,12,13,14,15,16)
seurat_anchors <- FindIntegrationAnchors(object.list = seurat_split, 
                                        normalization.method = "SCT", 
                                        anchor.features = seurat_features)
#---- Integrate across conditions
seurat_integrated <- IntegrateData(anchorset = seurat_anchors, 
                                   normalization.method = "SCT")
#---- Run PCA
seurat_integrated <- RunPCA(object = seurat_integrated)
ElbowPlot(object = seurat_integrated, 
          ndims = 50)
#---- Run UMAP
seurat_integrated <- RunUMAP(seurat_integrated, 
                             dims = 1:40,
                                   reduction = "pca")
#---- save rds
saveRDS(seurat_integrated, file="seurat_results/seurat_integrated.rds")
#---- Remove data
rm(seurat_filtered,seurat_split,seurat_features,seurat_anchors)
#----------  Plot PCA
#----- Load data
seurat_integrated <- readRDS(file = "seurat_results/seurat_integrated.rds")
#----- PCA
PCAPlot(seurat_integrated,
        group.by = "sample",
        split.by = "group")

#----- Explore heatmap of PCs
DimHeatmap(seurat_integrated, 
           dims = 1:9, 
           cells = 500, 
           balanced = TRUE)

#----- Printing out the most variable genes driving PCs
print(x = seurat_integrated[["pca"]], 
      dims = 1:9, 
      nfeatures = 5)
## PC_ 1 
## Positive:  C1qa, C1qb, C1qc, Cst3, Ctss 
## Negative:  Plp1, Ptgds, Car2, Cnp, Cryab 
## PC_ 2 
## Positive:  Plp1, Ptgds, Trf, Mbp, Cnp 
## Negative:  Aldoc, Clu, Atp1a2, Apoe, Gpr37l1 
## PC_ 3 
## Positive:  Cldn5, Bsg, Ly6c1, Ly6a, Flt1 
## Negative:  Meg3, Snhg11, Atp1b1, Pcp4, Snap25 
## PC_ 4 
## Positive:  Apoe, Aldoc, Cst3, Clu, Mt1 
## Negative:  Meg3, Snhg11, Atp1b1, Snap25, Bsg 
## PC_ 5 
## Positive:  Atp1b1, Mt1, Gad1, Aldoc, Clu 
## Negative:  C1ql1, Pdgfra, Olig1, Matn4, Tnr 
## PC_ 6 
## Positive:  Npy, Vtn, Fabp7, Gad1, Frzb 
## Negative:  Rtn1, Ly6h, Chn1, Cck, Thy1 
## PC_ 7 
## Positive:  Gad1, Meis2, Shisa8, Synpr, Cspg5 
## Negative:  Vtn, Apod, Chn1, Ptn, Uchl1 
## PC_ 8 
## Positive:  Cd52, Apoe, Fxyd5, Rpl32, Lyz2 
## Negative:  Vtn, Higd1b, Myl9, Ndufa4l2, Rgs5 
## PC_ 9 
## Positive:  Nbl1, Mgp, Cald1, Higd1b, Ndufa4l2 
## Negative:  Apod, Fabp7, Npy, Frzb, Cldn5
#----- Plot the elbow plot
ElbowPlot(object = seurat_integrated, 
          ndims = 50)

#---------- Plot UMAP                             
DimPlot(seurat_integrated, 
        reduction = "umap", 
        split.by = "group")

Seurat Clustering_Cells

Seurat uses a graph-based clustering approach, which embeds cells in a graph
structure, using a K-nearest neighbor (KNN) graph (by default), with edges
drawn between cells with similar gene expression patterns.

The resolution is an important argument that sets the “granularity” of the
downstream clustering and will need to be optimized for every individual
experiment. For datasets of 3,000 - 5,000 cells, the resolution set between
0.4-1.4 generally yields good clustering. Increased resolution values lead to a
greater number of clusters, which is often required for larger datasets.

#---------- Clustering_Cells
#----- Determine the K-nearest neighbor graph
seurat_cluster <- FindNeighbors(object = seurat_integrated, 
                                dims = 1:40)
#----- Determine the clusters for various resolutions                                
seurat_cluster <- FindClusters(object = seurat_cluster,
                               resolution = c(0.2, 0.4, 0.6, 0.8, 1.0, 1.2))
#---- save rds
saveRDS(seurat_cluster, file="seurat_results/seurat_cluster.rds")
#----- Remove data
rm(seurat_integrated)
#----- Load data
seurat_cluster <- readRDS(file = "seurat_results/seurat_cluster.rds")
#---------- Explore resolutions
#----- Assign identity of clusters
Idents(object = seurat_cluster) <- "integrated_snn_res.0.4"
#----- Plot UMAP                             
DimPlot(seurat_cluster, 
        reduction = "umap", 
        label = TRUE)
UMAP_0.4

UMAP_0.4

Seurat UMAP_Summary

Resolution 0.4

#----- Extract identity and sample information from seurat object to 
# determine the number of cells per cluster per sample
FetchData(seurat_cluster, vars = c("ident", "sample")) %>% 
  dplyr::count(ident, sample) %>% 
  tidyr::spread(ident, n) %>% 
  kable(caption="Number of Cells", 
        format.args = list(big.mark = ",")) %>% 
  kable_styling(c("striped", "bordered"))
Number of Cells
sample 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
Old1 270 179 196 181 143 130 197 78 109 187 93 86 45 60 47 16 8 20 7 36 19 18 17 13 19
Old2 176 142 142 266 79 164 144 37 93 127 83 67 62 50 29 17 4 14 8 17 7 29 18 14 10
Old3 652 522 560 276 410 560 289 294 286 160 247 159 168 85 84 48 58 30 29 50 50 38 34 32 33
Old4 729 447 424 273 425 309 292 258 250 217 201 159 135 74 73 54 72 39 28 42 30 40 29 41 27
Old5 354 252 185 275 207 248 141 109 150 90 112 56 72 33 46 57 65 29 20 40 22 20 19 14 23
Old6 353 247 195 364 169 176 134 133 107 97 163 66 60 46 33 53 45 33 34 17 17 11 17 21 16
Old7 419 337 293 262 270 265 212 175 168 57 173 87 62 80 35 20 36 27 12 17 43 22 39 22 24
Old8 558 396 318 306 317 251 208 193 202 61 246 89 43 51 46 26 32 31 18 35 60 31 29 35 24
Young1 266 143 82 185 132 132 89 80 39 36 108 27 14 26 16 22 39 20 14 18 13 9 6 24 6
Young2 263 204 228 106 142 124 103 139 147 170 84 67 18 22 26 15 4 24 45 27 49 29 20 8 18
Young3 221 198 257 160 121 156 136 152 161 182 101 110 55 43 32 26 21 32 33 25 17 21 16 8 21
Young4 361 259 258 166 209 150 157 183 159 113 130 101 52 49 29 16 6 36 41 27 31 21 17 22 30
Young5 237 209 179 189 138 153 124 162 149 189 83 104 68 36 48 23 20 22 38 18 14 31 15 9 18
Young6 261 160 189 206 142 142 116 159 132 204 129 93 34 27 41 20 13 24 41 14 18 13 19 32 6
Young7 329 321 289 169 211 167 197 224 198 178 72 145 66 42 50 21 15 30 30 26 15 18 29 14 16
Young8 369 315 347 125 174 136 203 194 201 178 44 142 108 66 71 20 14 39 36 20 19 29 19 8 14
#----- UMAP of cells in each cluster by sample
DimPlot(seurat_cluster, 
        label = TRUE, 
        split.by = "group") + NoLegend()

#----- plot
VlnPlot(object = seurat_cluster, 
        features = c("nGene", "nUMI", "log10GenesPerUMI", "mitoRatio"), 
        ncol = 2, 
        pt.size=0, 
        group.by="integrated_snn_res.0.4") 

FeaturePlot(seurat_cluster, 
            reduction = "umap", 
            features = c("nUMI", "nGene", "log10GenesPerUMI", "mitoRatio"),
            ncol = 2, 
            pt.size = 0.4, 
            sort.cell = TRUE,
            min.cutoff = 'q10',
            label = TRUE)

#----- cell phases
FetchData(seurat_cluster, vars = c("ident", "Phase")) %>% 
  dplyr::count(ident, Phase) %>% 
  tidyr::spread(ident, n) %>% 
  kable(caption="Number of Cells", format.args = list(big.mark = ",")) %>% 
  kable_styling(c("striped", "bordered"))
Number of Cells
Phase 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
G1 1,320 1,764 1,825 1,540 504 905 964 995 1,163 1,086 1,105 560 258 354 394 238 175 173 78 71 106 138 127 216 34
G2M 1,818 1,132 1,191 939 1,425 1,143 889 616 654 709 579 627 539 237 175 110 106 161 228 226 136 165 109 58 238
S 2,680 1,435 1,126 1,030 1,360 1,215 889 959 734 451 385 371 265 199 137 106 171 116 128 132 182 77 107 43 33
#----- Explore whether clusters segregate by cell cycle phase
DimPlot(seurat_cluster,
        label = TRUE, 
        split.by = "Phase")  + NoLegend()

#----- plot
FeaturePlot(seurat_cluster, 
            reduction = "umap", 
            features = c("S.Score", "G2M.Score"),
            ncol = 2,
            pt.size = 0.4, 
            sort.cell = TRUE,
            min.cutoff = 'q10',
            label = TRUE)

#----- Defining the information in the seurat object of interest
# Extracting this data from the seurat object
umap_data <- FetchData(seurat_cluster, 
                     vars = c(paste0("PC_", 1:16), 
                              "ident", "UMAP_1", "UMAP_2"))
#----- Extract the UMAP coordinates for the first 10 cells
#seurat_cluster@reductions$umap@cell.embeddings[1:10, 1:2]
# Adding cluster label to center of cluster on UMAP
umap_label <- FetchData(seurat_cluster, 
                        vars = c("ident", "UMAP_1", "UMAP_2"))  %>%
               group_by(ident) %>%
               summarise(x=mean(UMAP_1), y=mean(UMAP_2))
#----- Plotting a UMAP plot for each of the PCs
map(paste0("PC_", 1:16), function(pc){
        ggplot(umap_data, 
               aes(UMAP_1, UMAP_2)) +
                geom_point(aes_string(color=pc), 
                           alpha = 0.7) +
                scale_color_gradient(guide = FALSE, 
                                     low = "grey90", 
                                     high = "blue")  +
                geom_text(data=umap_label, 
                          aes(label=ident, x, y)) +
                ggtitle(pc)
}) %>% 
        plot_grid(plotlist = .)

#----- Explore heatmap of PCs
DimHeatmap(seurat_cluster, 
           dims = 1:9, 
           cells = 500, 
           balanced = TRUE)

#----- Printing out the most variable genes driving PCs
print(x = seurat_cluster[["pca"]], 
      dims = 1:9, 
      nfeatures = 5)
## PC_ 1 
## Positive:  C1qa, C1qb, C1qc, Cst3, Ctss 
## Negative:  Plp1, Ptgds, Car2, Cnp, Cryab 
## PC_ 2 
## Positive:  Plp1, Ptgds, Trf, Mbp, Cnp 
## Negative:  Aldoc, Clu, Atp1a2, Apoe, Gpr37l1 
## PC_ 3 
## Positive:  Cldn5, Bsg, Ly6c1, Ly6a, Flt1 
## Negative:  Meg3, Snhg11, Atp1b1, Pcp4, Snap25 
## PC_ 4 
## Positive:  Apoe, Aldoc, Cst3, Clu, Mt1 
## Negative:  Meg3, Snhg11, Atp1b1, Snap25, Bsg 
## PC_ 5 
## Positive:  Atp1b1, Mt1, Gad1, Aldoc, Clu 
## Negative:  C1ql1, Pdgfra, Olig1, Matn4, Tnr 
## PC_ 6 
## Positive:  Npy, Vtn, Fabp7, Gad1, Frzb 
## Negative:  Rtn1, Ly6h, Chn1, Cck, Thy1 
## PC_ 7 
## Positive:  Gad1, Meis2, Shisa8, Synpr, Cspg5 
## Negative:  Vtn, Apod, Chn1, Ptn, Uchl1 
## PC_ 8 
## Positive:  Cd52, Apoe, Fxyd5, Rpl32, Lyz2 
## Negative:  Vtn, Higd1b, Myl9, Ndufa4l2, Rgs5 
## PC_ 9 
## Positive:  Nbl1, Mgp, Cald1, Higd1b, Ndufa4l2 
## Negative:  Apod, Fabp7, Npy, Frzb, Cldn5

Seurat Marker_Identification

#---------- Marker_Identification
#----- Select the RNA counts slot to be the default assay
DefaultAssay(seurat_cluster) <- "RNA"
#----- Normalize RNA data for visualization purposes
seurat_markers <- NormalizeData(seurat_cluster, verbose = FALSE)
#---- save rds
saveRDS(seurat_markers, file="seurat_results/seurat_markers.rds")
#---------- Marker_Identification
#----- Load data
seurat_markers <- readRDS(file = "seurat_results/seurat_markers.rds")
#----- remove the x-axis text and tick
# plot.margin to adjust the white space between each plot.
# ... pass any arguments to VlnPlot in Seurat
vlnplot_gg <- function(obj, 
                          feature, 
                          pt.size = 0, 
                          plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
                          ...){
  p <- VlnPlot(obj, features = feature, pt.size = pt.size, ... ) + 
    xlab("") + 
    ylab(feature) + 
    ggtitle("") + 
    theme(legend.position = "none", 
          axis.text.x = element_blank(), 
          axis.ticks.x = element_blank(), 
          axis.title.y = element_text(size = rel(1), angle = 0), 
          axis.text.y = element_text(size = rel(1)), 
          plot.margin = plot.margin ) 
  return(p)
}
#----- extract the max value of the y axis
vlnplot_max<- function(p){
  ymax<- max(ggplot_build(p)$layout$panel_scales_y[[1]]$range$range)
  return(ceiling(ymax))
}
#----- main function
vlnplot_stacked <- function(obj, features,
                          pt.size = 0, 
                          plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
                          ...){
  plot_list<- purrr::map(features, function(x) vlnplot_gg(obj = obj,
                                                          feature = x, ...))
  # Add back x-axis title to bottom plot. patchwork is going to support this?
  plot_list[[length(plot_list)]] <- plot_list[[length(plot_list)]] + 
                                      theme(axis.text.x=element_text(), 
                                            axis.ticks.x = element_line())
  # change the y-axis tick to only max value 
  ymaxs <- purrr::map_dbl(plot_list, vlnplot_max)
  plot_list <- purrr::map2(plot_list, ymaxs, function(x,y) x + 
                             scale_y_continuous(breaks = c(y)) + 
                             expand_limits(y = y))
  p <- patchwork::wrap_plots(plotlist = plot_list, ncol = 1)
  return(p)
}

#---------- Oligodendrocyte lineage (and OEG)
#----- OPC: Oligodendrocyte_Precursor_Cells
# Pdgfra Cluster 7
#----- OLG: Oligodendrocytes
# Cldn11 Cluster 0,4,5,20 (21)
#----- OEG: Olfactory_Ensheathing_Glia
# Npy Cluster 12
vlnplot_stacked(obj = seurat_markers, 
               features = c("Pdgfra", "Cldn11", "Npy"))

FeaturePlot(seurat_markers, 
            reduction = "umap",
            features = c("Pdgfra","Cldn11", "Npy"), 
            min.cutoff = "q10", 
            max.cutoff = "q90",
            label = TRUE)

#---------- Astrocyte lineage (and NSC)
#----- NSC: Neural_Stem_Cells
# Thbs4 Cluster (11topleft)
#----- ARP: Astrocyte_Restricted_Precursors
# Cd44 Cluster (19left)
#----- ASC: Astrocytes
# Gja1 Cluster 2,8,9,11 (12,21,24) 
vlnplot_stacked(obj = seurat_markers, 
               features = c("Thbs4", "Cd44", "Gja1"))

FeaturePlot(seurat_markers, 
            reduction = "umap",
            features = c("Thbs4", "Cd44", "Gja1"), 
            min.cutoff = "q10", 
            max.cutoff = "q90",
            label = TRUE)

#---------- Neuronal lineage
#----- NRP: Neuronal_Restricted_Precursors
# Cdk1 Cluster 7middlebottom
#----- NEUR_immature: Neurons_Immature
# Sox11 Cluster 18left
#----- NEUR_mature: Neurons_Mature
# Syt1 Cluster 3,10,15,23,18right
#----- NendC: Neuroendocrine_Cells
# Baiap3 Cluster (10middle)
vlnplot_stacked(obj = seurat_markers, 
               features = c("Cdk1", "Sox11", "Syt1", "Baiap3"))

FeaturePlot(seurat_markers, 
            reduction = "umap",
            features = c("Cdk1", "Sox11", "Syt1", "Baiap3"), 
            min.cutoff = "q10", 
            max.cutoff = "q90",
            label = TRUE)

#-------------------- Ependymal cells
#---------- EPC: Ependymocytes
# Ccdc153 Cluster 24top
#---------- HypEPC: Hypendymal_Cells
# Sspo Cluster 
#---------- TNC: Tanycytes
# Rax Cluster
#---------- CPC: Choroid_Plexus_Epithelial_Cells
# Ttr Cluster 24bottom
vlnplot_stacked(obj = seurat_markers, 
               features = c("Ccdc153", "Sspo", "Rax", "Ttr"))

FeaturePlot(seurat_markers, 
            reduction = "umap",
            features = c("Ccdc153", "Sspo", "Rax", "Ttr"), 
            min.cutoff = "q10", 
            max.cutoff = "q90",
            label = TRUE)

#-------------------- Vasculature cells
#---------- EC: Endothelial_Cells
# Cldn5 Cluster 6,13 (12)
#---------- PC: Pericytes
# Kcnj8 Cluster 22 (13)
#---------- Hb-VC: Hemoglobin_Expressing_Vascular_Cells
# Alas2 Cluster 
#---------- VSMC: Vascular_Smooth_Muscle_Cells
# Acta2 Cluster (24) (13bottomright)
#---------- VLMC: Vascular_And_Leptomeningeal_Cells
# Slc6a13 Cluster 21bottom
#---------- ABC: Arachnoid_Barrier_Cells
# Slc47a1 Cluster 21top
vlnplot_stacked(obj = seurat_markers, 
               features = c("Cldn5", "Kcnj8", "Alas2", "Acta2", "Slc6a13", "Slc47a1"))

FeaturePlot(seurat_markers, 
            reduction = "umap",
            features = c("Cldn5", "Kcnj8", "Alas2", "Acta2", "Slc6a13", "Slc47a1"), 
            min.cutoff = "q10", 
            max.cutoff = "q90",
            label = TRUE)

#-------------------- Immune cells
#---------- MG: Microglia
# Tmem119 Cluster 1,14
#---------- MNC: Monocytes
# Plac8 Cluster 19left
#---------- MAC: Macrophages
# Pf4 Cluster 17
#---------- DC: Dendritic_Cells
# Cd209a Cluster 19middle
#---------- NEUT: Neutrophils
# S100a9 Cluster 
vlnplot_stacked(obj = seurat_markers, 
               features = c("Tmem119", "Plac8", "Pf4", "Cd209a", "S100a9"))

FeaturePlot(seurat_markers, 
            reduction = "umap",
            features = c("Tmem119", "Plac8", "Pf4", "Cd209a", "S100a9"), 
            min.cutoff = "q10", 
            max.cutoff = "q90",
            label = TRUE)

#---------- UMAP
DimPlot(seurat_markers, reduction="umap", label = TRUE) + NoLegend()

#---------- Immune_Cells
# http://bio-bigdata.hrbmu.edu.cn/CellMarker/index.jsp
# https://panglaodb.se/index.html
# rownames(seurat_anno@assays$RNA)[grepl(pattern = "Cd1", rownames(seurat_anno@assays$RNA))]
#----- B_Cells
FeaturePlot(seurat_markers, 
            reduction = "umap",
            features = c("Cd79b"), 
            min.cutoff = "q10", 
            max.cutoff = "q90",
            label = TRUE)

#----- Macrophages
FeaturePlot(seurat_markers, 
            reduction = "umap",
            features = c("Cd14", "Cd68"), 
            min.cutoff = "q10", 
            max.cutoff = "q90",
            label = TRUE)

#----- T_Cells
FeaturePlot(seurat_markers, 
            reduction = "umap",
            features = c("Cd4","Trbc2"), 
            min.cutoff = "q10", 
            max.cutoff = "q90",
            label = TRUE)

Seurat Cluster_Annotation

#---------- Cluster_Annotation
#----- manually add annotations derived above
seurat_anno <- RenameIdents(seurat_markers, 
                            `0` = "Oligodendrocytes",
                            `1` = "Microglia",
                            `2` = "Astrocytes",
                            `3` = "Neurons_Mature",
                            `4` = "Oligodendrocytes",
                            `5` = "Oligodendrocytes",
                            `6` = "Endothelial_Cells",
                            `7` = "Oligodendrocyte_Precursor_Cells",
                            `8` = "Astrocytes",
                            `9` = "Astrocytes",
                            `10` = "Neurons_Mature",
                            `11` = "Astrocytes",
                            `12` = "Olfactory_Ensheathing_Glia",
                            `13` = "Endothelial_Cells",
                            `14` = "Microglia",
                            `15` = "Neurons_Mature",
                            `16` = "Neurons_Mature",
                            `17` = "Macrophages",
                            `18` = "Neurons_Immature",
                            `19` = "Monocytes",
                            `20` = "Oligodendrocytes",
                            `21` = "Vascular_And_Leptomeningeal_Cells",
                            `22` = "Pericytes",
                            `23` = "Neurons_Mature",
                            `24` = "EPC")
#----- save these annotations in meta.data
seurat_anno@meta.data$cluster_annotation <- Idents(seurat_anno)
#---------- manually edit annotations
DimPlot(seurat_anno, reduction="umap", label = TRUE) + NoLegend()
metadata <- seurat_anno@meta.data
metadata$cluster_annotation <- "Na"
anno <- DimPlot(seurat_anno, reduction="umap", label = TRUE) 
anno <- CellSelector(plot = anno)
metadata$cluster_annotation[metadata$cells %in% anno] <- "B_Cells"
seurat_anno@meta.data <- metadata
#----- Assign identity of clusters
Idents(object = seurat_anno) <- "cluster_annotation"
DimPlot(seurat_anno, reduction="umap", label = TRUE) + NoLegend()
#---- save rds
saveRDS(seurat_anno, file="seurat_results/seurat_anno.rds")
#----- Remove data
rm(seurat_markers,anno)
#---------- Cluster_Annotation
#----- Load data
seurat_anno <- readRDS(file = "seurat_results/seurat_anno.rds")
#-----  UMAP cell clusters
DimPlot(seurat_anno, 
        reduction="umap", 
        label = FALSE)

DimPlot(seurat_anno, 
        reduction="umap", 
        label = TRUE, 
        label.size = 3, 
        repel = TRUE) + NoLegend()

DimPlot(seurat_anno, 
        reduction="umap", 
        label = TRUE,
        split.by = "group",
        label.size = 3, 
        repel = TRUE) + NoLegend()

#----- Celltype Counts
table(seurat_anno@meta.data$cluster_annotation)
## 
##           Arachnoid_Barrier_Cells                        Astrocytes 
##                               274                             10436 
##                           B_Cells   Choroid_Plexus_Epithelial_Cells 
##                               113                                66 
##                   Dendritic_Cells                 Endothelial_Cells 
##                                24                              3538 
##                     Ependymocytes                       Macrophages 
##                               238                               457 
##                         Microglia                         Monocytes 
##                              5043                               143 
##    Neuronal_Restricted_Precursors                  Neurons_Immature 
##                                34                               275 
##                    Neurons_Mature        Olfactory_Ensheathing_Glia 
##                              6960                              1070 
##   Oligodendrocyte_Precursor_Cells                  Oligodendrocytes 
##                              2535                             12836 
##                         Pericytes                           T_Cells 
##                               337                               149 
## Vascular_And_Leptomeningeal_Cells 
##                               106
#----- may want to ask if we are truely seperating interneurons from excitatory
DotPlot(seurat_anno,
        features = c("Pdgfra",  # Oligodendrocyte_Precursor_Cells
                     "Cldn11",  # Oligodendrocytes
                     "Npy",     # Olfactory_Ensheathing_Glia
                     "Thbs4",   # Neural_Stem_Cells
                     "Cd44",    # Astrocyte_Restricted_Precursors
                     "Gja1",    # Astrocytes
                     "Cdk1",    # Neuronal_Restricted_Precursors
                     "Sox11",   # Neurons_Immature
                     "Syt1",    # Neurons_Mature
                     "Baiap3",  # Neuroendocrine_Cells
                     "Ccdc153", # Ependymocytes
                     "Sspo",    # Hypendymal_Cells
                     "Rax",     # Tanycytes
                     "Ttr",     # Choroid_Plexus_Epithelial_Cells
                     "Cldn5",   # Endothelial_Cells
                     "Kcnj8",   # Pericytes
                     "Alas2",   # Hemoglobin_Expressing_Vascular_Cells
                     "Acta2",   # Vascular_Smooth_Muscle_Cells
                     "Slc6a13", # Vascular_And_Leptomeningeal_Cells
                     "Slc47a1", # Arachnoid_Barrier_Cells
                     "Tmem119", # Microglia
                     "Plac8",   # Monocytes
                     "Pf4",     # Macrophages
                     "Cd209a",  # Dendritic_Cells
                     "S100a9",  # Neutrophils
                     "Cd79b",   # B_Cells
                     "Trbc2"),  # T_Cells
        cols = c("blue", "red"), 
        dot.scale = 8) + 
  RotatedAxis()

# If we want to remove a cell cluster
#seurat_subset <- subset(seurat_integrated, idents = "Endothelial_cells", invert = TRUE)

#---------- Cluster_Cell_Counts
# Cluster_Annotation_Number_of_Cells
seurat_anno@meta.data %>% 
  dplyr::count(cluster_annotation, sample) %>%  
  ggplot(aes(x=cluster_annotation, y=n, fill=sample)) + 
    geom_bar(position="dodge", stat="identity") + 
    ggtitle("Number of Cells") + 
    ylab("Cell Count") +
    theme_classic() + 
    facet_wrap(~cluster_annotation, scale="free", ncol = 4)

# Cluster_Annotation_Frequency_of_Cells
seurat_anno@meta.data %>% 
  dplyr::count(cluster_annotation, sample) %>% 
  group_by(sample) %>% 
  mutate(freq = n / sum(n)) %>% 
  ggplot(aes(x=cluster_annotation, y=freq, fill=sample)) + 
    geom_bar(position="dodge", stat="identity") + 
    ggtitle("Frequency of Cells") + 
    theme_classic() + 
    #theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    facet_wrap(~cluster_annotation, scale="free", ncol = 4)

Seurat Differential_Cell_Proportions

Celltype Proportion
Count of Celltype per sample / Count of all Celltypes per sample

#---------- Differential_Cell_Proportions
# beta regression models are designed for data between 0 and 1
# This give me percent differences
# emmeans(betafit,pairwise~Group)
# emmeans(betafit,pairwise~Group,type="response")
# To test that group is important, second way to be pvaule
# betafit1 <- betareg(Oligodendrocytes ~ Group, data = cellproportions)
# betafit2 <- betareg(Oligodendrocytes ~ 1, data = cellproportions)
# lrtest(betafit1,betafit2)
#----- Cell proportions
cellproportions <- seurat_anno@meta.data %>% 
  dplyr::count(cluster_annotation, sample) %>% 
  dplyr::group_by(sample) %>% 
  dplyr::mutate(freq = n / sum(n)) %>%  
  dplyr::select(-n) %>% 
  tidyr::spread(cluster_annotation,freq) %>% 
  replace(is.na(.), 0) %>% 
  cbind(data.frame(Group = rep(c("Old", "Young"),times=c(8,8))))
#----- Cell Types
cell_types <- seurat_anno@meta.data %>% 
  arrange(cluster_annotation) %>% 
  select(cluster_annotation) %>% 
  unique() %>% 
  pull(cluster_annotation)
#----- Cell proportion differences
cellproportion_differences <- lapply(cell_types, function(cell_type) {
  lmfit <- lm(as.formula(paste(cell_type, "~ Group")), data = cellproportions)
  offset <- cellproportions
  offset[,cell_type] <- offset[,cell_type] + 0.000001
  if (cell_type == "Neuronal_Restricted_Precursors") {
    results <- data.frame(Celltype = cell_type,
                          lm_pvalue = summary(lmfit)$coefficients[2,4],
                          lm_diff = summary(lmfit)$coefficients[2,1],
                          betareg_pvalue = NA,
                          betareg_change = NA)
  } else {
      betafit <- betareg(as.formula(paste(cell_type, "~ Group")), data = offset)  
    results <- data.frame(Celltype = cell_type,
                          lm_pvalue = summary(lmfit)$coefficients[2,4],
                          lm_diff = summary(lmfit)$coefficients[2,1],
                          betareg_pvalue = summary(betafit)$coefficients$mean[2,4],
                          betareg_change = exp(summary(betafit)$coefficients$mean[2,1]))
  }
  return(results)
}) %>% bind_rows()
#----- Save Results
write_delim(cellproportion_differences, 
            "seurat_results/Aging_Mice_GSE129788_cellproportion_differences.txt", 
            delim = "\t")
#---------- Differential_Cell_Proportions_Figures
cellproportion_differences <- 
  read_delim("seurat_results/Aging_Mice_GSE129788_cellproportion_differences.txt", 
             "\t", escape_double = FALSE, trim_ws = TRUE)
#----- Cell proportions
cellproportions <- seurat_anno@meta.data %>% 
  dplyr::count(cluster_annotation, sample) %>% 
  dplyr::group_by(sample) %>% 
  dplyr::mutate(freq = n / sum(n)) %>%  
  dplyr::select(-n) %>% 
  tidyr::spread(cluster_annotation,freq) %>% 
  replace(is.na(.), 0) %>% 
  cbind(data.frame(Group = rep(c("Old", "Young"),times=c(8,8))))
#----- Cell Types
cell_types <- seurat_anno@meta.data %>% 
  arrange(cluster_annotation) %>% 
  select(cluster_annotation) %>% 
  unique() %>% 
  pull(cluster_annotation)
#----- Cell proportions figure
cellproportions %>% 
  gather(key = "cluster_annotation", value = "freq", 2:(length(cell_types)+1)) %>% 
  ggplot(aes(x=cluster_annotation, y=freq, fill=Group)) + 
    geom_boxplot() + 
    theme_classic() + 
    scale_fill_brewer(palette="Dark2") +
    facet_wrap(~cluster_annotation, scale="free", ncol = 4)

#----- Searchable Table
cellproportion_differences %>% 
  mutate_if(is.numeric, ~round(.,6)) %>% 
  DT::datatable()

Seurat Conserved_Markers

Identification of all markers for each cluster
This type of analysis is typically recommended for when evaluating a single
sample group/condition. With the FindAllMarkers() function we are comparing each
cluster against all other clusters to identify potential marker genes. The cells
in each cluster are treated as replicates, and essentially a differential
expression analysis is performed with some statistical test. Also, by default
this function will return to you genes that exhibit both positive and negative
expression changes. Typically, we add an argument only.pos to opt for keeping only the positive changes.

Identification of conserved markers in all conditions
Since we have samples representing different conditions in our dataset, our best
option is to find conserved markers. This function internally separates out
cells by sample group/condition, and then performs differential gene expression
testing for a single specified cluster against all other clusters (or a second
cluster, if specified). Gene-level p-values are computed for each condition and
then combined across groups using meta-analysis methods from the MetaDE R
package. Before we start our marker identification we will explicitly set our
default assay, we want to use the original counts and not the integrated data.
DefaultAssay(seurat_anno) <- “RNA”

#---------- Conserved_Markers
# RNA stores the original and normalized counts for finding markers
# DefaultAssay(seurat_anno) <- "RNA"
# Ensembl & Symbol Gene IDs 
#---- Gene Names
dir <- "cellranger_mRNA/"
name <- list.files(path = "cellranger_mRNA") %>% 
             gsub(pattern = "_count", replacement = "")
Gene_Emsembl_Symbol <- read_tsv(paste0(dir,name[[1]],
                                "_count/outs/filtered_feature_bc_matrix/features.tsv.gz"),
                                col_names = c("Gene_Emsembl","Gene_Symbol","Assay_type"), 
                                col_types="ccc")[,c(1:2)] %>% 
                                unique()
#---- Cell Types
cell_types <- seurat_anno@meta.data %>% 
  arrange(cluster_annotation) %>% 
  select(cluster_annotation) %>% 
  unique() %>% 
  pull(cluster_annotation)

#---------- Conserved Markers across sample groups
get_conserved <- function(seurat_rna,cell_type){
  FindConservedMarkers(seurat_rna,
                       ident.1 = cell_type, 
                       grouping.var = "group", 
                       only.pos = TRUE, 
                       min.diff.pct = 0.1, 
                       min.pct = 0.1, 
                       logfc.threshold = 0.25) %>% 
    rownames_to_column(var = "Gene_Symbol") %>% 
    left_join(Gene_Emsembl_Symbol, by="Gene_Symbol") %>% 
    dplyr::mutate(Conserved_Markers = cell_type)
}
Conserved_Markers <- mclapply(cell_types, function(i) {
  get_conserved(seurat_rna = seurat_anno,
                cell_type = i) 
}) %>% bind_rows()
#---- Save Results
write_delim(Conserved_Markers, 
            "seurat_results/Aging_Mice_GSE129788_Conserved_Markers.txt", 
            delim = "\t")

#---------- Conserved Marker Pathways
get_gprofiler <- function(markers,ident_name){ 
   genes <- markers %>% 
      dplyr::filter(Conserved_Markers == ident_name)  
   gost(query = genes$Gene_Symbol, 
        organism = "mmusculus",
        correction_method = "g_SCS")$result %>% 
      dplyr::filter(p_value <= 0.05) %>% 
      dplyr::mutate(Conserved_Markers = ident_name) %>% 
      dplyr::select(-parents) 
}
Conserved_Markers_gprofiler_pathways <- mclapply(cell_types, function(i) {
  get_gprofiler(Conserved_Markers,i) 
}) %>% bind_rows()
#---- Save Results
write_delim(as.data.frame(Conserved_Markers_gprofiler_pathways), 
            "seurat_results/Aging_Mice_GSE129788_Conserved_Markers_gprofiler_pathways.txt", 
            delim = "\t")
#---------- Conserved_Markers_Counts
Conserved_Markers <- read_delim("seurat_results/Aging_Mice_GSE129788_Conserved_Markers.txt", 
                            "\t", escape_double = FALSE, trim_ws = TRUE)
#----- Number of Conserved_Markers
Conserved_Markers %>% 
  dplyr::count(Conserved_Markers) %>% 
  kable(caption="Number of Conserved_Markers", 
        format.args = list(big.mark = ",")) %>% 
  kable_styling(c("striped", "bordered"))
Number of Conserved_Markers
Conserved_Markers n
Arachnoid_Barrier_Cells 1,086
Astrocytes 440
B_Cells 255
Choroid_Plexus_Epithelial_Cells 731
Dendritic_Cells 552
Endothelial_Cells 1,203
Ependymocytes 1,411
Macrophages 660
Microglia 623
Monocytes 547
Neuronal_Restricted_Precursors 1,322
Neurons_Immature 635
Neurons_Mature 925
Olfactory_Ensheathing_Glia 543
Oligodendrocyte_Precursor_Cells 597
Oligodendrocytes 1,000
Pericytes 319
T_Cells 399
Vascular_And_Leptomeningeal_Cells 578
#----- Extract top 10 markers per cluster
# Conserved_Markers %>% 
#   group_by(Conserved_Markers) %>% 
#   top_n(n = 10, 
#         wt = max_pval) %>% 
#   kable(caption="Top10 Conserved_Markers", 
#         format.args = list(big.mark = ",")) %>% 
#   kable_styling(c("striped", "bordered"))
#----- Searchable table
Conserved_Markers %>% 
  mutate_if(is.numeric, ~round(.,4)) %>% 
  datatable()
#---------- Conserved_Markers_Gprofiler
#----- Read Pathways files
Conserved_Markers_gprofiler_pathways <- 
  read_delim("seurat_results/Aging_Mice_GSE129788_Conserved_Markers_gprofiler_pathways.txt", 
             "\t", escape_double = FALSE, trim_ws = TRUE)
#----- Conserved_Markers_Gprofiler_Figure
get_gprofilerFig <- function(markers,ident_name){
  genes <- markers %>% 
    filter(Conserved_Markers == ident_name)  
  gost(query = genes$Gene_Symbol, 
       organism = "hsapiens",
       correction_method = "g_SCS") %>% 
    gostplot(capped = FALSE, interactive = FALSE)
}   
mclapply(cell_types, function(i) {
  cat(paste0("#----- ",i))
  fig <- get_gprofilerFig(Conserved_Markers,i)
  fig$data$query <- i
  return(fig)
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

#----- Conserved_Markers_Gprofiler_Top10Figure
get_gprofilerTop10 <- function(pathway,ident_name){
  pathway %>% 
    filter(Conserved_Markers == ident_name) %>% 
    select(Conserved_Markers,
           source, 
           term_id, 
           term_name,
           term_size,
           query_size,
           intersection_size,
           recall,
           p_value) %>% 
    top_n(n = -10, wt = p_value) %>% 
    ggplot(aes(x=recall, 
               y=term_name, 
               colour=p_value, 
               size=intersection_size)) + 
      geom_point() + 
      expand_limits(x=0) +
      labs(x="Hits (%)", 
           y=ident_name, 
           colour="p value", 
           size="Count") +
      theme_bw()
}
mclapply(cell_types, function(i) {
  cat(paste0("#----- ",i))
  get_gprofilerTop10(Conserved_Markers_gprofiler_pathways,i)
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

#----- Conserved_Markers_Gprofiler_Pathways
Conserved_Markers_gprofiler_pathways %>% 
  select(Conserved_Markers,
         source, 
         term_id, 
         term_name,
         term_size,
         query_size,
         intersection_size,
         recall,
         p_value) %>% 
  mutate_if(is.numeric, ~round(.,6)) %>% 
  datatable()

Seurat Differential_Expression

Find DEG between old and young mice per cell_type
Using 0.25 logFC

#---------- Differential_Expression
#----- RNA Assay stores the original and normalized counts
# Original counts : seurat_anno@assays[["RNA"]]@counts
# Normalized counts : seurat_anno@assays[["RNA"]]@data
DefaultAssay(seurat_anno) <- "RNA"
#----- Group Celltype ID
seurat_anno$group_cluster <- paste(seurat_anno$group, seurat_anno$cluster_annotation, sep = "_")
# make this the Idents()
Idents(seurat_anno) <- seurat_anno$group_cluster

#---------- Find Differential Expressed Genes
# get_deg <- function(seurat_rna,ident1,ident2){
#   FindMarkers(seurat_rna, 
#               ident.1 = ident1, 
#               ident.2 = ident2, 
#               test.use = "MAST", 
#               latent.vars = NULL, 
#               logfc.threshold = 0, 
#               min.pct = 0,
#               min.diff.pct = -Inf, 
#               min.cells.feature = 10, 
#               min.cells.group = 10, 
#               verbose = FALSE) %>% 
#     rownames_to_column(var = "Gene_Symbol") %>% 
#     left_join(Gene_Emsembl_Symbol, by="Gene_Symbol") %>% 
#     mutate(Comparison = paste(ident1, ident2, sep = "_vs_"))
# }
# Differential_Expression <- mclapply(cell_types, function(i) {
#   id1 <- paste0("Old_",i)
#   id2 <- paste0("Young_",i)
#   get_deg(seurat_anno,id1,id2) 
# }) %>% bind_rows()
#----- Cell Types
cell_types <- seurat_anno@meta.data %>% 
  arrange(cluster_annotation) %>% 
  select(cluster_annotation) %>% 
  unique() %>% 
  pull(cluster_annotation)
#----- Remove cell types with less than 10 cell per group
cell_types <- cell_types[!cell_types %in% c("Dendritic_Cells", 
                               "Neuronal_Restricted_Precursors")]
#----- Cell_Type DEG
for (cell_type in cell_types){
  print(cell_type)
  data <- FindMarkers(seurat_anno, 
                      ident.1 = paste0("Old_",cell_type), 
                      ident.2 = paste0("Young_",cell_type), 
                      test.use = "MAST", 
                      latent.vars = NULL, 
                      logfc.threshold = 0, 
                      min.pct = 0,
                      min.diff.pct = -Inf, 
                      min.cells.feature = 10, 
                      min.cells.group = 10, 
                      verbose = FALSE) %>% 
    rownames_to_column(var = "Gene_Symbol") %>% 
    left_join(Gene_Emsembl_Symbol, by="Gene_Symbol") %>% 
    mutate(Comparison = paste("Old_vs_", "Young_",cell_type, sep = ""))
  assign(paste("Old_vs_", "Young_",cell_type, sep = ""), data)
}
for(i in seq(length(cell_types))){
  file <- paste("Old_vs_", "Young_",cell_types[i], sep = "")
  print(file)
  if(i == 1){
    Differential_Expression <- get(file)
  } else{
      Differential_Expression <- rbind(Differential_Expression, 
                                       get(file))
  }
}
#----- save the results
write_delim(Differential_Expression, 
  "seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST.txt", 
  delim = "\t")
#---------- Differential_Expression_Genes
#---- Read Marker files
Differential_Expression <- 
  read_delim("seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST.txt", 
             "\t", escape_double = FALSE, trim_ws = TRUE)
#---- Number of DEGs
merge(Differential_Expression %>% 
        dplyr::count(Comparison) %>%
        dplyr::rename(Total_Genes = n), 
      Differential_Expression %>% 
        dplyr::filter(p_val_adj <= 0.05 & avg_log2FC >= 0.25 | 
                      p_val_adj <= 0.05 & avg_log2FC <= -0.25) %>% 
        dplyr::count(Comparison) %>%
        dplyr::rename(Genes_FDR0.05_logFC0.25 = n), 
                      by = "Comparison") %>% 
  kable(caption="Number of Differentially Expressed Genes", 
        format.args = list(big.mark = ",")) %>% 
  kable_styling(c("striped", "bordered"))
Number of Differentially Expressed Genes
Comparison Total_Genes Genes_FDR0.05_logFC0.25
Old_vs_Young_Arachnoid_Barrier_Cells 13,539 2
Old_vs_Young_Astrocytes 15,194 59
Old_vs_Young_B_Cells 9,558 6
Old_vs_Young_Endothelial_Cells 15,329 169
Old_vs_Young_Ependymocytes 14,127 24
Old_vs_Young_Macrophages 13,400 6
Old_vs_Young_Microglia 14,607 136
Old_vs_Young_Monocytes 11,069 1
Old_vs_Young_Neurons_Mature 15,514 36
Old_vs_Young_Olfactory_Ensheathing_Glia 14,707 15
Old_vs_Young_Oligodendrocyte_Precursor_Cells 15,442 184
Old_vs_Young_Oligodendrocytes 14,715 88
Old_vs_Young_Pericytes 13,027 1
Old_vs_Young_T_Cells 11,091 14
#---- Extract top 10 DEGs per cluster
# Differential_Expression %>% 
#   group_by(Comparison) %>% 
#   top_n(n = -10, 
#         wt = p_val_adj) %>% 
#   kable(caption="Differentially Expressed Genes", 
#         format.args = list(big.mark = ",")) %>% 
#   kable_styling(c("striped", "bordered"))
#---- Searchable table
Differential_Expression %>% 
  filter(p_val_adj <= 0.05 & avg_log2FC >= 0.25 | 
         p_val_adj <= 0.05 & avg_log2FC <= -0.25) %>% 
  select(Comparison,
         Gene_Symbol, 
         Gene_Emsembl, 
         pct.1,
         pct.2,
         p_val,
         p_val_adj,
         avg_log2FC) %>% 
  mutate_if(is.numeric, ~round(.,6)) %>% 
  datatable()

Seurat Differential_Expression_GSEAranked

#---------- Differential_Expression_GSEAranked
#----- GMT File
# edit GMT file cut -f 2 -d$'\t' --complement
gmt_file <- gmtPathways("seurat_results/Mouse_GOBP_AllPathways_no_GO_iea_January_13_2021_symbol_fgsea.gmt")
#----- Comparisons 
comparisons = unique(Differential_Expression$Comparison)
#----- GSEAranked
Differential_Expression_GSEAranked <- mclapply(comparisons, function(i) {
  rnk_file <- Differential_Expression %>% 
    filter(Comparison == i) %>% 
    select(Comparison, 
           Gene_Symbol, 
           p_val, 
           avg_log2FC) %>% 
    mutate(sign_log10Pvalue = sign(avg_log2FC) * -log10(p_val)) %>% 
    arrange(sign_log10Pvalue)
  rnk_file <- setNames(rnk_file$sign_log10Pvalue, rnk_file$Gene_Symbol)
  fgsea_rnk <- fgsea(gmt_file, rnk_file, nperm=1000, minSize=10, maxSize=500)
  fgsea_rnk <- fgsea_rnk %>% 
    mutate(Comparison = i) %>% 
    select(-leadingEdge)
  return(fgsea_rnk)
}) %>% bind_rows()
#----- save the results
write_delim(Differential_Expression_GSEAranked, 
  "seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST_GSEAranked.txt", 
  delim = "\t")
#---------- Differential_Expression_GSEAranked
#----- GSEAranked
Differential_Expression_GSEAranked <- 
  read_delim("seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST_GSEAranked.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
#----- GSEAranked Number of Pathways
Differential_Expression_GSEAranked %>% 
  dplyr::filter(padj <= 0.05) %>% 
  dplyr::count(Comparison) %>%
  dplyr::rename(Pathways_FDR0.05 = n) %>% 
  kable(caption="Number of Pathways", 
        format.args = list(big.mark = ",")) %>% 
  kable_styling(c("striped", "bordered"))
Number of Pathways
Comparison Pathways_FDR0.05
Old_vs_Young_Oligodendrocyte_Precursor_Cells 191
#----- GSEAranked Searchable table
Differential_Expression_GSEAranked %>% 
  filter(padj <= 0.05) %>% 
  arrange(padj) %>% 
  select(Comparison, 
         pathway, 
         pval, 
         padj, 
         ES, 
         NES) %>% 
  mutate_if(is.numeric, ~round(.,4)) %>% 
  datatable()

Seurat Differential_Expression_Senescence_Pathways

Senescence Pathways

#---------- Differential_Expression_Senescence_Pathways
#---- Differential_Expression_Genes
Differential_Expression <- 
  read_delim("seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST.txt", 
             "\t", escape_double = FALSE, trim_ws = TRUE)
#----- Comparisons 
comparisons = unique(Differential_Expression$Comparison)
#----- Senescence Pathways 
senescence_pathways = list.files("../Senescence/mouse/")
#----- Senescence_Pathways_Enrichment
Senescence_Pathways_Enrichment <- 
  lapply(comparisons, function(i) {
    genes <- Differential_Expression %>% 
               filter(Comparison == i) %>% 
               nrow()
    genes_sig <- Differential_Expression %>% 
                   filter(p_val_adj <= 0.05 & avg_log2FC >= 0.25 | 
                          p_val_adj <= 0.05 & avg_log2FC <= -0.25) %>% 
                   filter(Comparison == i) %>% 
                   nrow()
    lapply(senescence_pathways, function(p) {
      pathway <- read_delim(paste0("../Senescence/mouse/",p), 
                            "\t", escape_double = FALSE, trim_ws = TRUE)
      genes_pathway <- Differential_Expression %>% 
                         filter(Comparison == i) %>% 
                         filter(Gene_Symbol %in% pathway$Gene_Symbol) %>% 
                         nrow()  
      genes_pathway_sig <- Differential_Expression %>% 
                             filter(p_val_adj <= 0.05 & avg_log2FC >= 0.25 | 
                                    p_val_adj <= 0.05 & avg_log2FC <= -0.25) %>% 
                             filter(Comparison == i) %>% 
                             filter(Gene_Symbol %in% pathway$Gene_Symbol) %>% 
                             nrow()  
      enrichment <- data.frame(rbind(Background=c(genes-genes_sig, genes_sig), 
                      Foreground= c(genes_pathway-genes_pathway_sig, genes_pathway_sig)))  
      names(enrichment) <- c("Nonsignificant","Significant")  
      df <- data.frame("Comparison" = i,
              "Genes" = genes,
              "Sig_Genes" = genes_sig,
              "Pathway" = p, 
              "Pathway_Genes" = nrow(pathway), 
              "Pathway_Back_Genes" = genes_pathway,
              "Pathway_Sig_Genes" = genes_pathway_sig,
              "Fisher_Pvalue" = fisher.test(enrichment, alternative="greater")$p.value,
              "Fisher_OddsRatio"=fisher.test(enrichment, alternative="greater")$estimate)
      return(df)
    })
  }) %>% bind_rows()
#----- save the results
write_delim(Senescence_Pathways_Enrichment, 
  "seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST_Senescence_Pathways_Enrichment.txt", 
  delim = "\t")
#---------- Differential_Expression_Senescence_Pathways_Counts
#---- Differential_Expression_Senescence_Pathways
Senescence_Pathways_Enrichment <- 
  read_delim("seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST_Senescence_Pathways_Enrichment.txt", 
             "\t", escape_double = FALSE, trim_ws = TRUE)
#----- Differential_Expression_Senescence_Pathways Number 
Senescence_Pathways_Enrichment %>% 
  dplyr::filter(Fisher_Pvalue <= 0.05) %>% 
  dplyr::count(Comparison) %>%
  dplyr::rename(Fisher_Pvalue0.05 = n) %>% 
  kable(caption="Number of Pathways", 
        format.args = list(big.mark = ",")) %>% 
  kable_styling(c("striped", "bordered"))
Number of Pathways
Comparison Fisher_Pvalue0.05
Old_vs_Young_Endothelial_Cells 6
Old_vs_Young_Ependymocytes 1
Old_vs_Young_Microglia 1
Old_vs_Young_Olfactory_Ensheathing_Glia 2
Old_vs_Young_Oligodendrocyte_Precursor_Cells 3
Old_vs_Young_Oligodendrocytes 1
#----- Differential_Expression_Senescence_Pathways Searchable table
Senescence_Pathways_Enrichment %>% 
  filter(Fisher_Pvalue <= 0.05) %>% 
  arrange(Fisher_Pvalue) %>% 
  mutate(Fisher_Pvalue = round(Fisher_Pvalue, 4), 
         Fisher_OddsRatio = round(Fisher_OddsRatio, 2)) %>%
  datatable()